{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# K-means算法"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### K-means算法是一类典型的聚类算法，该算法的思想是最小化平方误差\n",
    "\n",
    "过程\n",
    "\n",
    "1. 从D中随机选择k个样本作为初始均值向量\n",
    "2. repeat\n",
    "   1. 令 C_i = 0\n",
    "   2. for j=1,2,...m\n",
    "       计算 $d_{ji}$ = ${||x_j-\\mu_i||}_2$\n",
    "       \n",
    "       选择最近的均值向量$x_j$,放入C_i\n",
    "   3. 计算新的均值向量\n",
    "       $\\hat{\\mu} = \\frac{1}{|C_i|}\\sum{x}$\n",
    "       \n",
    "       更新当前均值向量\n",
    "   4. 结束\n",
    "   \n",
    "### 这里根据周志华老师的西瓜数据集进行聚类操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from collections import defaultdict\n",
    "from utils.metric import euclidean_distance\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "watermelon = np.array([[ 0.697  ,0.46 ],\n",
    "                         [ 0.774  ,0.376],\n",
    "                         [ 0.634  ,0.264],\n",
    "                         [ 0.608  ,0.318],\n",
    "                         [ 0.556  ,0.215],\n",
    "                         [ 0.403  ,0.237],\n",
    "                         [ 0.481  ,0.149],\n",
    "                         [ 0.437  ,0.211],\n",
    "                         [ 0.666  ,0.091],\n",
    "                         [ 0.243  ,0.267],\n",
    "                         [ 0.245  ,0.057],\n",
    "                         [ 0.343  ,0.099],\n",
    "                         [ 0.639  ,0.161],\n",
    "                         [ 0.657  ,0.198],\n",
    "                         [ 0.36   ,0.37 ],\n",
    "                         [ 0.593  ,0.042],\n",
    "                         [ 0.719  ,0.103],\n",
    "                         [ 0.359  ,0.188],\n",
    "                         [ 0.339  ,0.241],\n",
    "                         [ 0.282  ,0.257],\n",
    "                         [ 0.748  ,0.232],\n",
    "                         [ 0.714  ,0.346],\n",
    "                         [ 0.483  ,0.312],\n",
    "                         [ 0.478  ,0.437],\n",
    "                         [ 0.525  ,0.369],\n",
    "                         [ 0.751  ,0.489],\n",
    "                         [ 0.532  ,0.472],\n",
    "                         [ 0.473  ,0.376],\n",
    "                         [ 0.725  ,0.445],\n",
    "                         [ 0.446  ,0.459]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def kmeans(data,k=3):\n",
    "    m = data.shape[0]\n",
    "    index = random.sample(range(m),k)\n",
    "    mu = data[index] #随机选择初始均值向量\n",
    "\n",
    "\n",
    "    while True:\n",
    "\n",
    "        C = defaultdict(list)\n",
    "\n",
    "        for j in range(0,m):\n",
    "            dij = [euclidean_distance(data[j],mu[i]) for i in range(k)]\n",
    "            lambda_j = np.argmin(dij)   #选择最小的值得下标\n",
    "\n",
    "            C[lambda_j].append(data[j].tolist())\n",
    "\n",
    "        new_mu = [np.mean(C[i],axis=0).tolist() for i in range(k)]\n",
    "\n",
    "        if (distance(np.array(new_mu),np.array(mu))>1e-9):\n",
    "            mu = new_mu\n",
    "        else:\n",
    "            break\n",
    "\n",
    "    return C,mu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "k = 2\n",
    "res,mu = kmeans(watermelon,k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFGZJREFUeJzt3WGMXFd5h/Hn7eJUCy3etjZFWTuNEcFpSAyhS5BIpdJG\nyKE0shOiEKiKCpWiVAopSLVwKtWKwocE+UNoSCCKIsS3RpYwq4SYLpKtlhZayRsc1knAlRsK9kYV\nDqmNQKtm7bz9MDP2eLPeubOemTv3zvOTrN05e73z3r3OP3fPOfecyEwkSfXya2UXIEnqPcNdkmrI\ncJekGjLcJamGDHdJqiHDXZJqyHCXpBoy3CWphgx3SaqhN5T1xuvWrcvLL7+8rLeXpEp65plnXs7M\n9Z2OKy3cL7/8cmZnZ8t6e0mqpIj4SZHj7JaRpBoy3CWphgx3Saohw12Sashwl6QaMtwlqYYKhXtE\n3BgRRyLiaETsXObrH4iIUxHxbPPPrt6XKkkqquM894gYAx4BPggcBw5GxJOZ+cKSQ/81M/+sDzVK\nkrpU5M79OuBoZr6Yma8CTwDb+luWJOliFAn3SeBY2+vjzbal3h8RcxHxrYh453LfKCLuiIjZiJg9\nceLEKsqVJBXRqwHV7wOXZeYW4EvA9HIHZeZjmTmVmVPr13dcGkGStEpF1paZBza2vd7QbDsrM3/R\n9vm+iPhyRKzLzJd7U6YklWf60Dy7Z47w0skFLp0YZ8fWzWy/drkOjOFR5M79IHBFRGyKiEuA24En\n2w+IiLdGRDQ/v675fX/e62IladCmD81zz97DzJ9cIIH5kwvcs/cw04fmO/7dMnUM98w8DdwFzAA/\nBPZk5vMRcWdE3Nk87FbguYj4AfAQcHtmZr+KlqRB2T1zhIXFM+e1LSyeYffMkZIqKqbQkr+ZuQ/Y\nt6Tt0bbPHwYe7m1pklS+l04udNU+LHxCVZJWcOnEeFftw8Jwl6QV7Ni6mfE1Y+e1ja8ZY8fWzSVV\nVExpOzFJUhW0ZsVUbbaM4S6t1twe2H8fnDoOazfADbtgy21lV6U+2H7t5NCH+VKGu7Qac3vgqbth\nsTmodupY4zUY8BoK9rlLq7H/vnPB3rK40GiXhoDhLq3GqePdtUsDZrhLq7F2Q3ft0oAZ7irP3B54\n8Gq4d6LxcW5P2RUVd8MuWLNknvOa8Ua7+mb60DzXP3CATTuf5voHDgz9EgBlckBV5aj6gGSrRmfL\nDExrjZfWUgCtNV6Ays1kGYQoawmYqampnJ2dLeW9NQQevLoR6Eut3QiffW7w9WjoXf/AAeaXeeR/\ncmKc7+78kxIqKkdEPJOZU52Os1tG5XBAUl2q6hovZTHcVQ4HJNWlqq7xUhbDXeVwQFJdquoaL2Vx\nQFXlcEBSXbrYNV6quJvSxXBAVVLtLZ1pA427/vtvuaZyAe+AqiQ1VXU3pYthuEuqvVGcaWO4q/qq\n/KSrBmIUZ9oY7qq21pOup44Bee5JVwNebUZxpo3hrmpz6V0VsP3aSe6/5RomJ8YJGk+1VnEwtRtO\nhVS1+aSrCqribkoXwzt3VZtPukrLMtxVbT7pKi3LcFe1bbkNbnqosZok0fh400M+6aqRZ5+7qm/L\nbYa5tIR37pJUQ4a7JNWQ4S5JNWS4S1INGe6SVEOGuyTVUKFwj4gbI+JIRByNiJ0rHPfeiDgdEbf2\nrkRJUrc6hntEjAGPAB8CrgI+FhFXXeC4LwDf7nWRkqTuFLlzvw44mpkvZuarwBPAtmWO+zTwdeBn\nPaxPkrQKRcJ9EjjW9vp4s+2siJgEbga+0rvSJEmr1asB1S8Cn8vM11Y6KCLuiIjZiJg9ceJEj95a\nkrRUkbVl5oGNba83NNvaTQFPRATAOuBPI+J0Zk63H5SZjwGPAUxNTeVqi5YkraxIuB8EroiITTRC\n/Xbg4+0HZOam1ucR8TXgm0uDXZI0OB3DPTNPR8RdwAwwBnw1M5+PiDubX3+0zzVKkrpUaMnfzNwH\n7FvStmyoZ+ZfXnxZkqSL4ROqklRDhrsk1ZA7Mama5vbA/vvg1PHGZtg37HI3Jg2V6UPz7J45wksn\nF7h0YpwdWzez/drJzn+xRwx3Vc/cHnjqblhcaLw+dazxGgx4DYXpQ/Pcs/cwC4tnAJg/ucA9ew8D\nDCzg7ZZR9ey/71ywtywuNNqlIbB75sjZYG9ZWDzD7pkjA6vBcK+quT3w4NVw70Tj49yesisanFPH\nu2vvlVH+masrL51c6Kq9Hwz3Kmp1S5w6BuS5bolRCZu1G7pr74VR/5mrK5dOjHfV3g+GexWNerfE\nDbtgzZL/SNaMN9r7ZdR/5urKjq2bGV8zdl7b+JoxdmzdPLAaHFCtorK6JYZFa9B0kLNlRv1nrq60\nBk2dLaPurN3Q7B5Ypn1UbLltsDNj/JmrS9uvnRxomC9lt0wVldEtMer8matiDPcq2nIb3PQQrN0I\nROPjTQ85x7uf/JmrYiKznGXVp6amcnZ2tpT3lqSqiohnMnOq03HeuUtSDRnuklRDzpaRVCtlL9g1\nLAx3SbUxDAt2DQu7ZSTVxjAs2DUsDHdJtTEMC3YNC8NdUm0Mw4Jdw8Jwl1Qbw7Bg17BwQFVSbQzD\ngl3DwnCXVCtlL9g1LOyWkaQaMtwlqYYMd0mqIfvcpRHgI/mjx3CXas5H8keT3TJSzflI/mgy3FUf\nc3vgwavh3onGx7k9ZVc0FHwkfzQZ7qqHuT3w1N3NTayz8fGpuw14fCR/VBnuqof998HikjvRxYVG\n+4jzkfzR5ICq6uHU8e7aR4iP5I+mQuEeETcC/wCMAY9n5gNLvr4N+DzwGnAa+Exm/luPa5UubO2G\nZpfMMu3ykfwR1LFbJiLGgEeADwFXAR+LiKuWHLYfeFdmvhv4FPB4rwuVVnTDLlizpA95zXijXRpB\nRfrcrwOOZuaLmfkq8ASwrf2AzPxlZmbz5ZuARBqkLbfBTQ/B2o1AND7e9FCjXRpBRbplJoH233eP\nA+9belBE3AzcD7wF+HBPqpO6seU2w1xq6tlsmcz8RmZeCWyn0f/+OhFxR0TMRsTsiRMnevXWkqQl\nioT7PLCx7fWGZtuyMvM7wNsiYt0yX3ssM6cyc2r9+vVdFytJKqZIuB8EroiITRFxCXA78GT7ARHx\n9oiI5ufvAX4d+Hmvi5UkFdOxzz0zT0fEXcAMjamQX83M5yPizubXHwU+AnwiIhaBBeCjbQOskqQB\ni7IyeGpqKmdnZ0t5b0mqqoh4JjOnOh3n8gOSVEOGuyTVkOEuSTVkuEtSDRnuklRD1Qx3d9yRpBVV\nbz331o47rY0ZWjvugOuKSFJT9cJ9pR13DHepp6YPzbvJR0VVL9zdcUcaiOlD89yz9zALi2cAmD+5\nwD17DwMY8BVQvT73C+2s4447Uk/tnjlyNthbFhbPsHvmSEkVqRvVC/d+7rjjQK101ksnF7pq13Cp\nXrj3a8ed1kDtqWNAnhuoNeA1oi6dGO+qXcOlen3u0J8ddxyolc6zY+vm8/rcAcbXjLFj6+YSq1JR\n1Qz3fnCgVjpPa9DU2TLVZLi3rN3Q7JJZpl0aUduvnTTMK6p6fe790s+BWkkaMMO9pV8Dtd1wto6k\nHrFbpl0/BmqLclkFST3knfuwWGm2Tln8TUKqLO/ch8WwzdbxNwmp0rxzHxbDtqzCMP4mIQ3Y9KF5\nrn/gAJt2Ps31Dxxg+tB82SUVZrgPi2GbrTNsv0lIA9ZaOG3+5ALJuYXTqhLwhvuwGIbZOu2G7TcJ\nacCqvnCafe7DpMzZOkvdsOv8Pndw3r9GStUXTvPOXcsbtt8kpAGr+sJp3rnrwobpNwlpwKq+cJrh\nLknLqPrCaYa7NEDuSVotVV44zXCXBsQ9STVIDqhqMFzKoPJT61Qt3rmr/1zKAKj+1DpVi3fu6j+X\nMgCqP7VO1WK4q/9cygBoTK0bXzN2XluVptapWgqFe0TcGBFHIuJoROxc5ut/HhFzEXE4Ir4XEe/q\nfamqLJcyABqDpvffcg2TE+MEMDkxzv23XONgqvqiY597RIwBjwAfBI4DByPiycx8oe2wHwN/lJn/\nGxEfAh4D3tePglVBLmVwVpWn1hXldM/hUGRA9TrgaGa+CBARTwDbgLPhnpnfazv+P4DRuiXTylqD\npvvva3TFrN3QCPYRGkwdFU73HB5Fwn0SONb2+jgr35X/FfCt5b4QEXcAdwBcdtllBUtULbiUwUhY\nabqn4T5YPR1QjYg/phHun1vu65n5WGZOZebU+vXre/nWkoaA0z2HR5Fwnwc2tr3e0Gw7T0RsAR4H\ntmXmz3tTni7Ih4I0hJzuOTyKhPtB4IqI2BQRlwC3A0+2HxARlwF7gb/IzP/sfZk6T+uhoFPHgDz3\nUJABr5I53XN4dAz3zDwN3AXMAD8E9mTm8xFxZ0Tc2TxsF/A7wJcj4tmImO1bxfKhIA0tp3sOj8jM\nUt54amoqZ2f9f8Cq3DsBLHfdAu49OehqJA1QRDyTmVOdjvMJ1SryoSBJHRjuVXTDrsZDQO1G9KEg\nScsz3KvI/U0ldeCSv1XlQ0GSVuCduyTVkOEuSTVkuEtSDRnuklRDDqhKKsR12qvFcJfUkeu0V4/d\nMpI6Wmmddg0nw11SR67TXj2Gu6SOXKe9egx31YObl/SV67RXjwOqqr7W5iWtNe5bm5eASzT0SGvQ\n1Nky1eF67qq+B69u7kq1xNqN8NnnBl+P1Eeu567Rcep4d+3SCDDcVX1uXiK9juGu6nPzEul1DHdV\nn5uXSK/jbBnVg5uXSOfxzl2Sashwl6QaMtwlqYYMd0mqIcNdkmrIcJekGjLcJamGDHdJqiHDXZJq\nyHCXpBoy3CWphgqFe0TcGBFHIuJoROxc5utXRsS/R8T/RcTf9r5MSere9KF5rn/gAJt2Ps31Dxxg\n+tB82SUNTMeFwyJiDHgE+CBwHDgYEU9m5gtth70C3A1s70uVo2ZuD+y/r7HZxNoNjaVrXRRL6sr0\noXnu2XuYhcUzAMyfXOCevYcBRmJ7wCJ37tcBRzPzxcx8FXgC2NZ+QGb+LDMPAot9qHG0tPYDPXUM\nyHP7gbrhs9SV3TNHzgZ7y8LiGXbPHCmposEqEu6TQPsGlcebbeqH/fed2+i5ZXGh0S6psJdOLnTV\nXjcDHVCNiDsiYjYiZk+cODHIt64O9wOVeuLSifGu2uumSLjPAxvbXm9otnUtMx/LzKnMnFq/fv1q\nvkX9uR+oaqiMgc0dWzczvmbsvLbxNWPs2Lq57+89DIqE+0HgiojYFBGXALcDT/a3rBHmfqCqmdbA\n5vzJBZJzA5v9Dvjt105y/y3XMDkxTgCTE+Pcf8s1IzGYCgVmy2Tm6Yi4C5gBxoCvZubzEXFn8+uP\nRsRbgVngzcBrEfEZ4KrM/EUfa6+n1qwYZ8uoJlYa2Ox30G6/dnJkwnypQnuoZuY+YN+StkfbPv8f\nGt016gX3A1WNjPrAZll8QlVSX436wGZZDHdJfTXqA5tlKdQtI0mr1erz3j1zhJdOLnDpxDg7tm4e\n2b7wQTHcJfXdKA9slsVuGUmqIcNdkmrIcJekGjLcJamGDHdJqiHDXZJqqLrhPrcHHrwa7p1ofHQz\nC0k6q5rz3Fu7FbU2tWjtVgSuySJJVPXO3d2KJGlF1Qx3dyuSpBVVM9zdrUiSVlTNcHe3IklaUTUH\nVN2tSDpr+tC8Ky7qdaoZ7uBuRRLn9idtbWPX2p8UMOBHXDW7ZSQBK+9PqtFmuEsV5v6kuhDDXaow\n9yfVhRjuUoW5P6kupLoDqpLcn1QXZLhLFef+pFqO3TKSVEOGuyTVkOEuSTVkuEtSDRnuklRDhrsk\n1ZDhLkk1ZLhLUg0Z7pJUQ4a7JNWQ4S5JNRSZWc4bR5wAflLKm/fOOuDlsovoI8+v2jy/arvQ+f1e\nZq7v9JdLC/c6iIjZzJwqu45+8fyqzfOrtos9P7tlJKmGDHdJqiHD/eI8VnYBfeb5VZvnV20XdX72\nuUtSDXnnLkk1ZLh3EBE3RsSRiDgaETuX+fq2iJiLiGcjYjYi/rCMOler0/m1HffeiDgdEbcOsr5e\nKHANPxARp5rX8NmI2FVGnatV5Bo2z/HZiHg+Iv5l0DVejALXb0fbtXsuIs5ExG+XUetqFDi/tRHx\nVET8oHn9PlnoG2emfy7wBxgD/gt4G3AJ8APgqiXH/Abnure2AD8qu+5enl/bcQeAfcCtZdfdh2v4\nAeCbZdfax/ObAF4ALmu+fkvZdffy/JYcfxNwoOy6e3z9/g74QvPz9cArwCWdvrd37iu7DjiamS9m\n5qvAE8C29gMy85fZ/KkDbwKqNIjR8fyaPg18HfjZIIvrkaLnWFVFzu/jwN7M/ClAZlbpOnZ7/T4G\n/ONAKuuNIueXwG9GRNC4mXwFON3pGxvuK5sEjrW9Pt5sO09E3BwRPwKeBj41oNp6oeP5RcQkcDPw\nlQHW1UuFriHw/mb32rci4p2DKa0nipzfO4Dfioh/johnIuITA6vu4hW9fkTEG4EbadyIVEWR83sY\n+H3gJeAw8DeZ+Vqnb2y490BmfiMzrwS2A58vu54e+yLwuSL/mCrs+zS6LLYAXwKmS66n194A/AHw\nYWAr8PcR8Y5yS+qLm4DvZuYrZRfSY1uBZ4FLgXcDD0fEmzv9JcN9ZfPAxrbXG5pty8rM7wBvi4h1\n/S6sR4qc3xTwRET8N3Ar8OWI2D6Y8nqi4zlm5i8y85fNz/cBa2p2DY8DM5n5q8x8GfgO8K4B1Xex\nuvlv8Haq1SUDxc7vkzS61TIzjwI/Bq7s+J3LHlAY5j807nheBDZxbrDjnUuOeTvnBlTf07wwUXbt\nvTq/Jcd/jeoNqBa5hm9tu4bXAT+t0zWk8Sv9/uaxbwSeA64uu/ZenV/zuLU0+qLfVHbNfbh+XwHu\nbX7+u82MWdfpe7+hY/qPsMw8HRF3ATM0RrW/mpnPR8Sdza8/CnwE+ERELAILwEezeRWGXcHzq7SC\n53gr8NcRcZrGNby9TtcwM38YEf8EzAGvAY9n5nPlVV1cF/9Gbwa+nZm/KqnUVSl4fp8HvhYRh4Gg\n0U3acTVMn1CVpBqyz12Sashwl6QaMtwlqYYMd0mqIcNdkmrIcJekGjLcJamGDHdJqqH/B0AwnxpB\nPtPcAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x117d85198>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for i in range(k):\n",
    "    res_i = np.array(res[i])\n",
    "    plt.scatter(res_i[:,0],res_i[:,1])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
